#Importing libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import spacy
# Loading data
# pd.read_csv already returns a DataFrame, so the original extra
# pd.DataFrame(data) wrap was redundant and has been removed.
df = pd.read_csv('/content/train.csv')
df.head()
| Class Index | Title | Description | |
|---|---|---|---|
| 0 | 3 | Wall St. Bears Claw Back Into the Black (Reuters) | Reuters - Short-sellers, Wall Street's dwindli... |
| 1 | 3 | Carlyle Looks Toward Commercial Aerospace (Reu... | Reuters - Private investment firm Carlyle Grou... |
| 2 | 3 | Oil and Economy Cloud Stocks' Outlook (Reuters) | Reuters - Soaring crude prices plus worries\ab... |
| 3 | 3 | Iraq Halts Oil Exports from Main Southern Pipe... | Reuters - Authorities have halted oil export\f... |
| 4 | 3 | Oil prices soar to all-time record, posing new... | AFP - Tearaway world oil prices, toppling reco... |
# Preprocessing using NLTK: fetch the required corpora once, then build
# the shared resources used by preprocess_text below.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    """Clean one document: strip URLs and punctuation, lowercase,
    tokenize, drop stopwords/non-alphabetic tokens, and lemmatize.

    Returns the surviving lemmas joined back into a single string.
    """
    without_urls = re.sub(r'http\S+', '', text)
    cleaned = re.sub(r'[^\w\s]', '', without_urls)
    tokens = word_tokenize(cleaned.lower())
    kept = [
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok.isalpha() and tok not in stop_words
    ]
    return ' '.join(kept)
[nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Unzipping tokenizers/punkt.zip. [nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Unzipping corpora/stopwords.zip. [nltk_data] Downloading package wordnet to /root/nltk_data...
# Preprocess the data: replace every description with its cleaned form
# (this mutates `df` in place; all vectorizers below consume the result).
df['Description'] = df['Description'].apply(preprocess_text)
df.head()
| Class Index | Title | Description | |
|---|---|---|---|
| 0 | 3 | Wall St. Bears Claw Back Into the Black (Reuters) | reuters shortsellers wall street dwindlingband... |
| 1 | 3 | Carlyle Looks Toward Commercial Aerospace (Reu... | reuters private investment firm carlyle groupw... |
| 2 | 3 | Oil and Economy Cloud Stocks' Outlook (Reuters) | reuters soaring crude price plus worriesabout ... |
| 3 | 3 | Iraq Halts Oil Exports from Main Southern Pipe... | reuters authority halted oil exportflows main ... |
| 4 | 3 | Oil prices soar to all-time record, posing new... | afp tearaway world oil price toppling record s... |
Count Vectorizer
# Vectorize Data using CountVectorizer (bag-of-words token counts)
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
count_vectors = count_vectorizer.fit_transform(df['Description'])

# Inspect the result: overall shape plus the first five sparse rows
print("Shape of CountVectorizer output:", count_vectors.shape)
print("Sample of CountVectorizer output:\n", count_vectors[0:5])
Shape of CountVectorizer output: (120000, 76755) Sample of CountVectorizer output: (0, 56678) 1 (0, 60813) 1 (0, 73522) 1 (0, 64309) 1 (0, 19043) 1 (0, 70976) 1 (0, 59619) 1 (0, 26958) 1 (1, 56678) 1 (1, 50953) 1 (1, 32584) 1 (1, 23094) 1 (1, 10458) 1 (1, 27205) 1 (1, 56290) 1 (1, 38801) 1 (1, 74314) 1 (1, 45302) 1 (1, 49506) 1 (1, 16289) 1 (1, 31499) 1 (1, 52252) 1 (1, 49378) 1 (1, 6989) 1 (1, 3225) 1 : : (3, 55126) 1 (3, 40995) 1 (3, 14436) 1 (3, 64379) 1 (3, 45532) 1 (3, 58159) 1 (3, 58641) 1 (4, 50830) 1 (4, 45795) 1 (4, 1053) 1 (4, 66753) 1 (4, 75488) 1 (4, 69276) 1 (4, 55334) 1 (4, 64240) 1 (4, 73529) 1 (4, 50674) 1 (4, 43810) 1 (4, 19355) 1 (4, 40354) 1 (4, 5969) 1 (4, 68233) 1 (4, 41846) 1 (4, 50707) 1 (4, 19729) 1
TFIDF Vectorizer
# Vectorize Data using TFIDFVectorizer (term-frequency / inverse-document-frequency weights)
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectors = tfidf_vectorizer.fit_transform(df['Description'])

# Inspect the result: overall shape plus the first five sparse rows
print("Shape of TFIDFVectorizer output:", tfidf_vectors.shape)
print("Sample of TFIDFVectorizer output:\n", tfidf_vectors[0:5])
Shape of TFIDFVectorizer output: (120000, 76755) Sample of TFIDFVectorizer output: (0, 26958) 0.26120760429720336 (0, 59619) 0.3268992877359646 (0, 70976) 0.4788872172220486 (0, 19043) 0.4956310331859318 (0, 64309) 0.22095216909926335 (0, 73522) 0.23237227130027355 (0, 60813) 0.4788872172220486 (0, 56678) 0.13332162218413063 (1, 39267) 0.13210831516292193 (1, 47854) 0.15116228728107386 (1, 3225) 0.14812230327410614 (1, 6989) 0.2365271868732828 (1, 49378) 0.3495855390259896 (1, 52252) 0.2293744496364001 (1, 31499) 0.1478922763239179 (1, 16289) 0.17000534423398342 (1, 49506) 0.15785065511530497 (1, 45302) 0.3495855390259896 (1, 74314) 0.3377755522875477 (1, 38801) 0.1693698205965034 (1, 56290) 0.23131697004727522 (1, 27205) 0.3495855390259896 (1, 10458) 0.3057762598690284 (1, 23094) 0.15335782709646312 (1, 32584) 0.17525469924436288 : : (3, 49256) 0.24250881461709184 (3, 38694) 0.19916032038236778 (3, 21684) 0.3876754926577527 (3, 45795) 0.2856449171846865 (3, 27786) 0.2728355586005189 (3, 5119) 0.18785714065780482 (3, 56678) 0.10428226261363707 (4, 19729) 0.167090831117269 (4, 50707) 0.1887230402755787 (4, 41846) 0.1510916010717379 (4, 68233) 0.14857343286907976 (4, 5969) 0.26246899772780335 (4, 40354) 0.34170299112097874 (4, 19355) 0.18903741377828379 (4, 43810) 0.10373034430655782 (4, 50674) 0.2636133878691397 (4, 73529) 0.2985165578783655 (4, 64240) 0.3374206498987674 (4, 55334) 0.1649815824424969 (4, 69276) 0.31914751140462877 (4, 75488) 0.13100900914048855 (4, 66753) 0.41064103518077777 (4, 1053) 0.16620174854673797 (4, 45795) 0.1512831307477046 (4, 50830) 0.14692025741852865
Word2Vec
# Vectorize Data using Word2Vec
import gensim
from gensim.models import Word2Vec

# Tokenize the (already preprocessed) text into lists of words
tokenized_text = df['Description'].apply(lambda x: x.split())

# Train a Word2Vec model in a single pass. Passing the corpus to the
# constructor already trains the model, so the original follow-up call to
# .train() re-trained with a stale learning-rate schedule — that is what
# produced gensim's "Effective 'alpha' higher than previous training
# cycles" warning. Requesting epochs=10 up front trains once, correctly.
word2vec_model = Word2Vec(tokenized_text, min_count=1, epochs=10)

# Create Word2Vec vectors (one embedding per token) for the text column.
# min_count=1 guarantees every corpus token is in the vocabulary.
word2vec_vectors = tokenized_text.apply(lambda x: [word2vec_model.wv[word] for word in x])

# Print a small sample of the output
print("Sample of Word2Vec output:", word2vec_vectors.head())
WARNING:gensim.models.word2vec:Effective 'alpha' higher than previous training cycles
Sample of Word2Vec output: 0 [[-1.1920155, -2.1841326, -0.4772949, 0.183996... 1 [[-1.1920155, -2.1841326, -0.4772949, 0.183996... 2 [[-1.1920155, -2.1841326, -0.4772949, 0.183996... 3 [[-1.1920155, -2.1841326, -0.4772949, 0.183996... 4 [[-0.61951846, -1.9208267, -1.0441942, 1.53346... Name: Description, dtype: object
Google Word2Vec Model
# Mount Google Drive so the pretrained GoogleNews vectors can be read
# (Colab-only; prompts for authorization on first run).
from google.colab import drive
drive.mount('/content/drive/')
Mounted at /content/drive/
# Load the pretrained GoogleNews Word2Vec model (binary word2vec format)
google_model = gensim.models.KeyedVectors.load_word2vec_format(
    "/content/drive/MyDrive/GoogleNews-vectors-negative300.bin", binary=True
)

# Per-token GoogleNews vectors for each document; tokens absent from the
# pretrained vocabulary are simply skipped.
google_w2v_vectors = tokenized_text.apply(
    lambda tokens: [google_model[t] for t in tokens if t in google_model]
)
print("Sample of GoogleNews Word2Vec output:", google_w2v_vectors.head())
Sample of GoogleNews Word2Vec output: 0 [[-0.052001953, 0.061767578, -0.13671875, -0.1... 1 [[-0.052001953, 0.061767578, -0.13671875, -0.1... 2 [[-0.052001953, 0.061767578, -0.13671875, -0.1... 3 [[-0.052001953, 0.061767578, -0.13671875, -0.1... 4 [[0.125, 0.06982422, -0.08886719, -0.10644531,... Name: Description, dtype: object
def mean_vector(words):
    """Average the trained Word2Vec embeddings of `words`.

    Words missing from the model vocabulary are skipped; if none remain,
    a zero vector of the model's dimensionality is returned so every
    document still maps to a fixed-length feature vector.
    """
    # Filter out words that are not in the Word2Vec model's vocabulary
    valid_words = [word for word in words if word in word2vec_model.wv]
    if valid_words:
        # Calculate the mean vector for valid words
        vectors = [word2vec_model.wv[word] for word in valid_words]
        return np.mean(vectors, axis=0)
    else:
        # Return a zero vector if no valid words are found
        return np.zeros(word2vec_model.vector_size)

# Apply the function to the tokenized text.
# BUG FIX: the original applied this to `X_train_preprocessed`, a name that
# is never defined anywhere in the notebook (NameError at runtime); the
# tokenized corpus is the series actually available here.
X_train_w2v = np.array(tokenized_text.apply(mean_vector).tolist())
def mean_vector(words):
    """Return the mean Word2Vec embedding of `words`, or a zero vector of
    the model's dimensionality when no word is in the vocabulary.

    NOTE(review): this re-defines the identical `mean_vector` declared
    earlier in the notebook; consider keeping only one copy.
    """
    in_vocab = [w for w in words if w in word2vec_model.wv]
    if not in_vocab:
        # No known words: fall back to a fixed-length zero vector
        return np.zeros(word2vec_model.vector_size)
    return np.mean([word2vec_model.wv[w] for w in in_vocab], axis=0)
#%%
# Apply the function to the tokenized text
word2vec_mean_vectors = np.array(tokenized_text.apply(mean_vector).tolist())
# Importing the libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Logistic Regression with CountVectorizer
# Hold out 20% of the documents for evaluation (fixed seed so every
# experiment below sees the same split of labels).
X_train, X_test, y_train, y_test = train_test_split(
    count_vectors, df['Class Index'], test_size=0.2, random_state=42
)

# Fit the model, then evaluate on the held-out split
lr_model_count = LogisticRegression(max_iter=1000)
lr_model_count.fit(X_train, y_train)
y_pred_count = lr_model_count.predict(X_test)
print("Classification Report for Logistic Regression with CountVectorizer:\n", classification_report(y_test, y_pred_count))
Classification Report for Logistic Regression with CountVectorizer:
precision recall f1-score support
1 0.90 0.89 0.90 5956
2 0.95 0.96 0.95 6058
3 0.86 0.87 0.86 5911
4 0.88 0.87 0.87 6075
accuracy 0.90 24000
macro avg 0.90 0.90 0.90 24000
weighted avg 0.90 0.90 0.90 24000
# Logistic Regression with TFIDFVectorizer
# Same 80/20 split (seed fixed for comparability across experiments)
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_vectors, df['Class Index'], test_size=0.2, random_state=42
)

# Fit, predict, and report
lr_model_tfidf = LogisticRegression(max_iter=1000)
lr_model_tfidf.fit(X_train, y_train)
y_pred_tfidf = lr_model_tfidf.predict(X_test)
print("Classification Report for Logistic Regression with TFIDFVectorizer:\n", classification_report(y_test, y_pred_tfidf))
Classification Report for Logistic Regression with TFIDFVectorizer:
precision recall f1-score support
1 0.92 0.89 0.91 5956
2 0.95 0.98 0.96 6058
3 0.87 0.87 0.87 5911
4 0.88 0.88 0.88 6075
accuracy 0.91 24000
macro avg 0.91 0.91 0.91 24000
weighted avg 0.91 0.91 0.91 24000
# Logistic Regression with Word2Vec
# NOTE: the original re-defined the identical `mean_vector` function a
# third time and re-averaged the whole corpus here; both were already done
# earlier in the notebook, so the duplicates are removed and the existing
# `word2vec_mean_vectors` array is reused unchanged.

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    word2vec_mean_vectors, df['Class Index'], test_size=0.2, random_state=42
)
# Initialize and train the Logistic Regression model
lr_model_word2vec = LogisticRegression(max_iter=1000)
lr_model_word2vec.fit(X_train, y_train)
# Make predictions and print the classification report
y_pred_word2vec = lr_model_word2vec.predict(X_test)
print("Classification Report for Logistic Regression with Word2Vec:\n", classification_report(y_test, y_pred_word2vec))
Classification Report for Logistic Regression with Word2Vec:
precision recall f1-score support
1 0.89 0.87 0.88 5956
2 0.94 0.96 0.95 6058
3 0.83 0.84 0.83 5911
4 0.85 0.83 0.84 6075
accuracy 0.88 24000
macro avg 0.88 0.88 0.88 24000
weighted avg 0.88 0.88 0.88 24000
def mean_vector_google_w2v(words):
    """Average the pretrained GoogleNews embeddings of `words`; return a
    zero vector of the model's dimensionality when no word is known."""
    known = [w for w in words if w in google_model]
    if not known:
        # No in-vocabulary words: keep the feature length fixed
        return np.zeros(google_model.vector_size)
    return np.mean([google_model[w] for w in known], axis=0)

# Document-level features: one mean pretrained vector per description
google_w2v_mean_vectors = np.array(tokenized_text.apply(mean_vector_google_w2v).tolist())

# Same 80/20 split (seed fixed for comparability across experiments)
X_train, X_test, y_train, y_test = train_test_split(
    google_w2v_mean_vectors, df['Class Index'], test_size=0.2, random_state=42
)

# Fit, predict, and report
lr_model_google_w2v = LogisticRegression(max_iter=1000)
lr_model_google_w2v.fit(X_train, y_train)
y_pred_google_w2v = lr_model_google_w2v.predict(X_test)
print("Classification Report for Logistic Regression with GoogleNews Word2Vec:\n", classification_report(y_test, y_pred_google_w2v))
Classification Report for Logistic Regression with GoogleNews Word2Vec:
precision recall f1-score support
1 0.89 0.88 0.89 5956
2 0.94 0.96 0.95 6058
3 0.83 0.85 0.84 5911
4 0.86 0.84 0.85 6075
accuracy 0.88 24000
macro avg 0.88 0.88 0.88 24000
weighted avg 0.88 0.88 0.88 24000
# Using SGD Classifier with CountVectorizer
from sklearn.linear_model import SGDClassifier

# Same 80/20 split (seed fixed for comparability across experiments)
X_train, X_test, y_train, y_test = train_test_split(
    count_vectors, df['Class Index'], test_size=0.2, random_state=42
)

# hinge loss makes this a linear SVM trained with stochastic gradient descent
sgd_model_count = SGDClassifier(loss='hinge', max_iter=10000)
sgd_model_count.fit(X_train, y_train)

# Predict on the held-out split and report
y_pred_count_sgd = sgd_model_count.predict(X_test)
print("Classification Report for SGD Classifier with CountVectorizer:\n", classification_report(y_test, y_pred_count_sgd))
Classification Report for SGD Classifier with CountVectorizer:
precision recall f1-score support
1 0.92 0.89 0.91 5956
2 0.95 0.98 0.96 6058
3 0.87 0.88 0.87 5911
4 0.89 0.88 0.88 6075
accuracy 0.91 24000
macro avg 0.91 0.91 0.91 24000
weighted avg 0.91 0.91 0.91 24000
# SVM Classifier with TFIDFVectorizer
from sklearn.svm import LinearSVC

# Same 80/20 split (seed fixed for comparability across experiments)
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_vectors, df['Class Index'], test_size=0.2, random_state=42
)

# Fit a linear support-vector classifier, predict, and report
linear_svc_model_tfidf = LinearSVC(max_iter=10000)
linear_svc_model_tfidf.fit(X_train, y_train)
y_pred_tfidf_svc = linear_svc_model_tfidf.predict(X_test)
print("Classification Report for LinearSVC with TFIDFVectorizer:\n", classification_report(y_test, y_pred_tfidf_svc))
Classification Report for LinearSVC with TFIDFVectorizer:
precision recall f1-score support
1 0.92 0.89 0.90 5956
2 0.95 0.98 0.96 6058
3 0.87 0.88 0.87 5911
4 0.89 0.88 0.88 6075
accuracy 0.91 24000
macro avg 0.91 0.91 0.91 24000
weighted avg 0.91 0.91 0.91 24000
# SGD Classifier with hyperparameter search over Word2Vec features
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report

# Same 80/20 split (seed fixed for comparability across experiments)
X_train, X_test, y_train, y_test = train_test_split(
    word2vec_mean_vectors, df['Class Index'], test_size=0.2, random_state=42
)

# Reduced search space: 2 alphas x 2 iteration caps = 4 candidates total,
# so with n_iter=4 the "random" search is actually exhaustive.
param_grid = {
    'alpha': [1e-3, 1e-2],
    'max_iter': [1000, 5000],
    'penalty': ['l2'],
    'loss': ['hinge'],
}

# 5-fold CV over the 4 candidates, parallelized across all cores
random_search_word2vec_sgd = RandomizedSearchCV(
    SGDClassifier(), param_grid, n_iter=4, cv=5, verbose=1, n_jobs=-1
)
random_search_word2vec_sgd.fit(X_train, y_train)

print("Best parameters found:", random_search_word2vec_sgd.best_params_)

# Evaluate the refit best estimator on the held-out split
y_pred_word2vec_sgd_random = random_search_word2vec_sgd.predict(X_test)
print("Classification Report for SGD Classifier with Word2Vec (RandomizedSearchCV):\n", classification_report(y_test, y_pred_word2vec_sgd_random))
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters found: {'penalty': 'l2', 'max_iter': 1000, 'loss': 'hinge', 'alpha': 0.001}
Classification Report for SGD Classifier with Word2Vec (RandomizedSearchCV):
precision recall f1-score support
1 0.89 0.87 0.88 5956
2 0.93 0.97 0.95 6058
3 0.84 0.82 0.83 5911
4 0.84 0.85 0.84 6075
accuracy 0.88 24000
macro avg 0.88 0.88 0.88 24000
weighted avg 0.88 0.88 0.88 24000
# SGD Classifier with GoogleNews Word2Vec
# Same 80/20 split (seed fixed for comparability across experiments)
X_train, X_test, y_train, y_test = train_test_split(
    google_w2v_mean_vectors, df['Class Index'], test_size=0.2, random_state=42
)

# hinge loss = linear SVM trained with stochastic gradient descent
sgd_model_google_w2v = SGDClassifier(loss='hinge', max_iter=10000)
sgd_model_google_w2v.fit(X_train, y_train)

# Predict on the held-out split and report
y_pred_google_w2v_sgd = sgd_model_google_w2v.predict(X_test)
print("Classification Report for SGD Classifier with GoogleNews Word2Vec:\n", classification_report(y_test, y_pred_google_w2v_sgd))
Classification Report for SGD Classifier with GoogleNews Word2Vec:
precision recall f1-score support
1 0.91 0.87 0.89 5956
2 0.93 0.97 0.95 6058
3 0.83 0.84 0.84 5911
4 0.86 0.84 0.85 6075
accuracy 0.88 24000
macro avg 0.88 0.88 0.88 24000
weighted avg 0.88 0.88 0.88 24000
# Random Forest with CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# Same 80/20 split (seed fixed for comparability across experiments)
X_train, X_test, y_train, y_test = train_test_split(
    count_vectors, df['Class Index'], test_size=0.2, random_state=42
)

# Deliberately small forest (50 shallow trees) to keep training tractable
# on the 76k-feature sparse matrix
rf_model_count = RandomForestClassifier(n_estimators=50, max_depth=10, max_features='sqrt', random_state=42)
rf_model_count.fit(X_train, y_train)

# Predict on the held-out split and report
y_pred_count_rf = rf_model_count.predict(X_test)
print("Classification Report for Random Forest with CountVectorizer:\n", classification_report(y_test, y_pred_count_rf))
Classification Report for Random Forest with CountVectorizer:
precision recall f1-score support
1 0.81 0.76 0.78 5956
2 0.84 0.88 0.86 6058
3 0.77 0.71 0.74 5911
4 0.69 0.75 0.72 6075
accuracy 0.77 24000
macro avg 0.78 0.77 0.77 24000
weighted avg 0.78 0.77 0.77 24000
# Random Forest with TFIDF Vectorizer
# Same 80/20 split (seed fixed for comparability across experiments)
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_vectors, df['Class Index'], test_size=0.2, random_state=42
)

# Same deliberately small forest configuration as the CountVectorizer run
rf_model_tfidf = RandomForestClassifier(n_estimators=50, max_depth=10, max_features='sqrt', random_state=42)
rf_model_tfidf.fit(X_train, y_train)

# Predict on the held-out split and report
y_pred_tfidf_rf = rf_model_tfidf.predict(X_test)
print("Classification Report for Random Forest with TFIDFVectorizer:\n", classification_report(y_test, y_pred_tfidf_rf))
Classification Report for Random Forest with TFIDFVectorizer:
precision recall f1-score support
1 0.80 0.77 0.79 5956
2 0.84 0.88 0.86 6058
3 0.78 0.71 0.74 5911
4 0.69 0.75 0.72 6075
accuracy 0.78 24000
macro avg 0.78 0.78 0.78 24000
weighted avg 0.78 0.78 0.78 24000
# Random Forest with Word2Vec
# Same 80/20 split (seed fixed for comparability across experiments)
X_train, X_test, y_train, y_test = train_test_split(
    word2vec_mean_vectors, df['Class Index'], test_size=0.2, random_state=42
)

# Same deliberately small forest configuration as the other runs
rf_model_word2vec = RandomForestClassifier(n_estimators=50, max_depth=10, max_features='sqrt', random_state=42)
rf_model_word2vec.fit(X_train, y_train)

# Predict on the held-out split and report
y_pred_word2vec_rf = rf_model_word2vec.predict(X_test)
print("Classification Report for Random Forest with Word2Vec:\n", classification_report(y_test, y_pred_word2vec_rf))
Classification Report for Random Forest with Word2Vec:
precision recall f1-score support
1 0.87 0.87 0.87 5956
2 0.92 0.96 0.94 6058
3 0.83 0.82 0.82 5911
4 0.84 0.82 0.83 6075
accuracy 0.87 24000
macro avg 0.87 0.87 0.87 24000
weighted avg 0.87 0.87 0.87 24000
# Random Forest on GoogleNews Word2Vec
# Same 80/20 split (seed fixed for comparability across experiments)
X_train, X_test, y_train, y_test = train_test_split(
    google_w2v_mean_vectors, df['Class Index'], test_size=0.2, random_state=42
)

# Same deliberately small forest configuration as the other runs
rf_model_google_w2v = RandomForestClassifier(n_estimators=50, max_depth=10, max_features='sqrt', random_state=42)
rf_model_google_w2v.fit(X_train, y_train)

# Predict on the held-out split and report
y_pred_google_w2v_rf = rf_model_google_w2v.predict(X_test)
print("Classification Report for Random Forest with GoogleNews Word2Vec:\n", classification_report(y_test, y_pred_google_w2v_rf))
Classification Report for Random Forest with GoogleNews Word2Vec:
precision recall f1-score support
1 0.85 0.86 0.85 5956
2 0.90 0.94 0.92 6058
3 0.80 0.80 0.80 5911
4 0.83 0.78 0.81 6075
accuracy 0.84 24000
macro avg 0.84 0.84 0.84 24000
weighted avg 0.84 0.84 0.84 24000
from sklearn.metrics import accuracy_score
# Accuracy score calculations.
# NOTE(review): `y_test` here is whatever the LAST train_test_split above
# produced (the GoogleNews split). Every split uses the same labels,
# test_size=0.2 and random_state=42, so the test labels coincide across
# experiments and these comparisons are valid — but this is fragile;
# re-verify if any split's parameters change.
accuracy_rf_count = accuracy_score(y_test, y_pred_count_rf)
accuracy_rf_tfidf = accuracy_score(y_test, y_pred_tfidf_rf)
accuracy_rf_word2vec = accuracy_score(y_test, y_pred_word2vec_rf)
accuracy_rf_google_w2v = accuracy_score(y_test, y_pred_google_w2v_rf)
# NOTE(review): despite the "svc" names, these hold the SGD / LinearSVC /
# randomized-search predictions from the corresponding experiments above.
accuracy_svc_count = accuracy_score(y_test,y_pred_count_sgd)
accuracy_svc_tfidf = accuracy_score(y_test, y_pred_tfidf_svc)
accuracy_svc_word2vec = accuracy_score(y_test, y_pred_word2vec_sgd_random)
accuracy_svc_google_w2v = accuracy_score(y_test, y_pred_google_w2v_sgd)
accuracy_lr_count = accuracy_score(y_test, y_pred_count)
accuracy_lr_tfidf = accuracy_score(y_test, y_pred_tfidf)
accuracy_lr_word2vec = accuracy_score(y_test, y_pred_word2vec)
accuracy_lr_google_w2v = accuracy_score(y_test, y_pred_google_w2v)
# Visualize the model/vectorizer accuracies as an interactive sunburst
import plotly.express as px
import pandas as pd
import plotly.io as pio

pio.renderers.default = 'notebook'

# One row per (model, vectorizer) pair, built from the scores above
_scores = {
    'Random Forest': {
        'CountVectorizer': accuracy_rf_count,
        'TFIDFVectorizer': accuracy_rf_tfidf,
        'Word2Vec': accuracy_rf_word2vec,
        'GoogleNews Word2Vec': accuracy_rf_google_w2v,
    },
    'SVM': {
        'CountVectorizer': accuracy_svc_count,
        'TFIDFVectorizer': accuracy_svc_tfidf,
        'Word2Vec': accuracy_svc_word2vec,
        'GoogleNews Word2Vec': accuracy_svc_google_w2v,
    },
    'Logistic Regression': {
        'CountVectorizer': accuracy_lr_count,
        'TFIDFVectorizer': accuracy_lr_tfidf,
        'Word2Vec': accuracy_lr_word2vec,
        'GoogleNews Word2Vec': accuracy_lr_google_w2v,
    },
}
data = [
    {'Model': model, 'Vectorizer': vec, 'Accuracy': acc}
    for model, by_vec in _scores.items()
    for vec, acc in by_vec.items()
]
df_plot = pd.DataFrame(data)

# Inner ring = model, outer ring = vectorizer; segment size and color both
# driven by accuracy
fig = px.sunburst(df_plot, path=['Model', 'Vectorizer'], values='Accuracy',
                  title='Comparison of Accuracy for Different Models and Vectorizers',
                  color='Accuracy',
                  color_continuous_scale='viridis', template='simple_white')
fig.show()